In [2]:
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline
First of all, the dataset must be loaded. The column headers will also be defined according to the AVA dataset specification.
In [3]:
ava_header = ["row_number",
"image_id",
"1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
"Semantic Tag 1", "Semantic Tag 2",
"Challenge ID"]
ava_dataset = pd.read_table("AVA.txt", sep = " ", header=None, names = ava_header)
ava_dataset.head()
Out[3]:
In [4]:
# Weighted mean of the vote histogram: each score (1-10) weighted by its
# vote count, divided by the total number of votes the image received
weights = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ones = [1] * 10
ava_dataset["mean"] = ava_dataset.loc[:, '1':'10'].dot(weights) / ava_dataset.loc[:, '1':'10'].dot(ones)
In [5]:
ava_dataset["mean > 5"] = ava_dataset["mean"] >= 5.0
ava_dataset["mean > 6"] = ava_dataset["mean"] >= 6.5
ava_dataset["mean < 4"] = ava_dataset["mean"] <= 4.5
ava_dataset["mean_2houses"] = ava_dataset["mean"].round(1)
In [6]:
ava_dataset.loc[:,'1':'10'].head()
Out[6]:
In [7]:
ava_dataset.head()
Out[7]:
In [8]:
ava_challenge_counts = ava_dataset.groupby(["Challenge ID"]).size()
ava_challenge_counts.sort_values(ascending=False).head().reset_index()
Out[8]:
Note that the challenge with the most photos has only 1108 instances, which might be too small for the preliminary tests. Let's group the images by semantic tag instead, which is a good way to gather pictures of the same category.
In [9]:
ava_challenge_counts = ava_dataset.groupby(["Semantic Tag 1"]).size().rename('Count')
ava_challenge_counts.sort_values(ascending=False).head().reset_index()
Out[9]:
Tag 0 is the absence of a tag, while 15 stands for nature, 14 for landscapes and 1 for abstract. Let's focus on these three tags and ignore the rest of the instances.
In [10]:
ava_nature = ava_dataset[ava_dataset["Semantic Tag 1"] == 15]
ava_landscapes = ava_dataset[ava_dataset["Semantic Tag 1"] == 14]
ava_abstract = ava_dataset[ava_dataset["Semantic Tag 1"] == 1]

# Top 20 semantic tags by image count, shown as a bar plot
ordered_counts = ava_challenge_counts.reset_index().sort_values(by="Count", ascending=False).head(n=20)
ax = seaborn.barplot(x="Semantic Tag 1", y="Count", data=ordered_counts, order=ordered_counts["Semantic Tag 1"])
ax.set(xlabel='Semantic Group', ylabel='Count')
ax.set_title("Distribution of images by semantic group")
Out[10]:
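Before moving on, it is worth checking that these tag-based subsets are indeed larger than the 1108 images of the biggest single challenge (the exact counts depend on the AVA.txt file in use):
In [ ]:
# Sanity check: each tag subset should comfortably exceed the largest single challenge (1108 images)
print(len(ava_nature), len(ava_landscapes), len(ava_abstract))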
In [11]:
ordered_counts.head()
Out[11]:
In [12]:
ava_abstract.head()
Out[12]:
In [20]:
fig, axs = plt.subplots(ncols=3)
plot_nature = seaborn.countplot(x="mean_2houses", data=ava_nature, ax=axs[0])
plot_landscapes = seaborn.countplot(x="mean_2houses", data=ava_landscapes, ax=axs[1])
plot_abstract = seaborn.countplot(x="mean_2houses", data=ava_abstract, ax=axs[2])
fig.set_size_inches(15.5, 4.5)
def reduce_ticks(plot):
    # mean_2houses steps by 0.1, so keeping every 10th label leaves roughly one tick per unit of score
    for ind, label in enumerate(plot.get_xticklabels()):
        if ind % 10 == 9:  # every 10th label is kept
            label.set_visible(True)
        else:
            label.set_visible(False)
reduce_ticks(plot_nature)
reduce_ticks(plot_landscapes)
reduce_ticks(plot_abstract)
plot_nature.set(xlabel="Mean", ylabel="Count")
plot_landscapes.set(xlabel="Mean", ylabel="Count")
plot_abstract.set(xlabel="Mean", ylabel="Count")
plot_nature.set_title("Nature")
plot_landscapes.set_title("Landscapes")
plot_abstract.set_title("Abstract")
fig.savefig("Médias")
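As a side note, the same tick thinning can be done without a helper function by slicing the categorical ticks and labels directly; a minimal sketch with matplotlib (applied here only to the nature plot):
In [ ]:
# Keep every 10th tick/label by slicing, instead of toggling label visibility
labels = [t.get_text() for t in plot_nature.get_xticklabels()]
plot_nature.set_xticks(plot_nature.get_xticks()[::10])
plot_nature.set_xticklabels(labels[::10])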
In [ ]:
plot_landscapes = seaborn.countplot(x="mean_2houses", data=ava_landscapes)
In [ ]:
plot_abstract = seaborn.countplot(x="mean_2houses", data=ava_abstract)
In [ ]:
ava_nature["mean_2houses"].mean()
ava_nature["mean_2houses"].std()